Related work: "Low-dimensional, free-energy landscapes of protein-folding reactions by nonlinear dimensionality reduction" (Das et al., 2006)

Notes:

Questions:

  • How generalizable are the learned reaction coordinates both (1) among many shorter trajectories of the same system and (2) between different systems?
  • What are the reaction coordinates being learned?

In [8]:
from __future__ import print_function
import numpy as np
from matplotlib import pyplot as plt
from matplotlib.colors import LogNorm

from msmbuilder.decomposition import tICA
from msmbuilder.example_datasets import fetch_met_enkephalin
from msmbuilder.featurizer import AtomPairsFeaturizer
from sklearn.pipeline import Pipeline

%matplotlib inline

In [9]:
dataset = fetch_met_enkephalin()
print(dataset.DESCR)


The dataset consists of ten ~50 ns molecular dynamics (MD) simulation
trajectories of the 5-residue Met-enkephalin peptide. The aggregate
sampling is 499.58 ns. Simulations were performed starting from the 1st
model in the 1PLX PDB file, solvated with 832 TIP3P water molecules using
OpenMM 6.0. The coordinates (protein only -- the water was stripped)
are saved every 5 picoseconds. Each of the ten trajectories is roughly
50 ns long and contains about 10,000 snapshots.

Forcefield: amber99sb-ildn; water: tip3p; nonbonded method: PME; cutoffs:
1nm; bonds to hydrogen were constrained; integrator: langevin dynamics;
temperature: 300K; friction coefficient: 1.0/ps; pressure control: Monte
Carlo barostat (interval of 25 steps); timestep 2 fs.

The dataset is available on figshare at

http://dx.doi.org/10.6084/m9.figshare.1026324
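
A quick sanity check on the loaded trajectories can confirm the numbers quoted above (ten trajectories, ~10,000 frames each, 5 ps per saved frame). A minimal sketch, assuming the trajectories are mdtraj Trajectory objects as returned by the msmbuilder example datasets:

# Sketch: confirm trajectory count, frame counts, and save interval
print(len(dataset.trajectories))                    # expect 10
print([t.n_frames for t in dataset.trajectories])   # each roughly 10,000
print(dataset.trajectories[0].timestep)             # expect 5.0 ps between saved frames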


In [10]:
def fit_and_plot(pipeline, trajectories):
    # Featurize and project each trajectory, then pool all frames.
    transformed = pipeline.fit_transform(trajectories)
    transformed = np.concatenate(transformed)

    print('Eigenvalue sum', pipeline.named_steps['tica'].eigenvalues_.sum())

    x = transformed[:, 0]
    y = transformed[:, 1]

    # 2D histogram of the first two tICA components on a log color scale.
    plt.axes(axisbg='w')  # note: 'axisbg' is called 'facecolor' in newer matplotlib
    plt.grid(False)
    plt.hist2d(x, y, bins=100, cmap='hot_r', norm=LogNorm())
    plt.title('Heatmap (log color scale)')
    plt.colorbar()

In [11]:
from itertools import combinations
heavy_atoms = dataset.trajectories[0].topology.select_atom_indices('heavy')
heavy_pairs = list(combinations(heavy_atoms, 2))

from msmbuilder.msm import MarkovStateModel

pipeline1 = Pipeline([
    ('feat', AtomPairsFeaturizer(heavy_pairs)),
    ('tica', tICA(gamma=0, n_components=2)),
    ('msm', MarkovStateModel())  # NOTE: fails below; the MSM expects 1D state sequences, not tICA coordinates
])

fit_and_plot(pipeline1, dataset.trajectories)


---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-11-b72d89ccee69> in <module>()
     10 ])
     11 
---> 12 fit_and_plot(pipeline1, dataset.trajectories)

<ipython-input-10-813fc87132cb> in fit_and_plot(pipeline, trajectories)
      1 def fit_and_plot(pipeline, trajectories):
----> 2     transformed = pipeline.fit_transform(trajectories)
      3     transformed = np.concatenate(transformed)
      4 
      5     print('Eigenvalue sum', pipeline.named_steps['tica'].eigenvalues_.sum())

/Users/joshuafass/Library/Enthought/Canopy_64bit/User/lib/python2.7/site-packages/sklearn/pipeline.pyc in fit_transform(self, X, y, **fit_params)
    137         Xt, fit_params = self._pre_transform(X, y, **fit_params)
    138         if hasattr(self.steps[-1][-1], 'fit_transform'):
--> 139             return self.steps[-1][-1].fit_transform(Xt, y, **fit_params)
    140         else:
    141             return self.steps[-1][-1].fit(Xt, y, **fit_params).transform(Xt)

/Users/joshuafass/Library/Enthought/Canopy_64bit/User/lib/python2.7/site-packages/sklearn/base.pyc in fit_transform(self, X, y, **fit_params)
    424         if y is None:
    425             # fit method of arity 1 (unsupervised transformation)
--> 426             return self.fit(X, **fit_params).transform(X)
    427         else:
    428             # fit method of arity 2 (supervised transformation)

/Users/joshuafass/Library/Enthought/Canopy_64bit/User/lib/python2.7/site-packages/msmbuilder-3.0.0_beta-py2.7-macosx-10.6-x86_64.egg/msmbuilder/msm/msm.pyc in fit(self, sequences, y)
    151         NaN or None.
    152         """
--> 153         sequences = list_of_1d(sequences)
    154         # step 1. count the number of transitions
    155         if int(self.lag_time) <= 0:

/Users/joshuafass/Library/Enthought/Canopy_64bit/User/lib/python2.7/site-packages/msmbuilder-3.0.0_beta-py2.7-macosx-10.6-x86_64.egg/msmbuilder/utils/validation.pyc in list_of_1d(y)
     18             raise ValueError(
     19                 "Bad input shape. Element %d has shape %s, but "
---> 20                 "should be 1D" % (i, str(value.shape)))
     21         result.append(value)
     22     return result

ValueError: Bad input shape. Element 0 has shape (9979, 2), but should be 1D
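
The MSM step expects a list of 1D state-label sequences, but tICA passes it 2D coordinates per trajectory, hence the shape error. One possible fix, sketched here on the assumption that msmbuilder.cluster.MiniBatchKMeans is available in this MSMBuilder build, is to insert a clustering step between tICA and the MSM so the MSM receives discrete microstate labels:

from msmbuilder.cluster import MiniBatchKMeans  # assumption: present in this MSMBuilder version
from msmbuilder.msm import MarkovStateModel

pipeline1_fixed = Pipeline([
    ('feat', AtomPairsFeaturizer(heavy_pairs)),
    ('tica', tICA(gamma=0, n_components=2)),
    ('cluster', MiniBatchKMeans(n_clusters=100)),  # discretize tICA space into microstates
    ('msm', MarkovStateModel())
])
# pipeline1_fixed.fit(dataset.trajectories)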

In [12]:
from msmbuilder.msm import MarkovStateModel

In [13]:
from sklearn import manifold

In [14]:
im = manifold.Isomap()

In [15]:
pipeline2 = Pipeline([
    ('feat', AtomPairsFeaturizer(heavy_pairs)),
    ('tica', tICA(n_components=2))
])

In [16]:
pipeline3 = Pipeline([
    ('feat', AtomPairsFeaturizer(heavy_pairs)),
    ('tica', tICA(n_components=2)),
    ('msm', MarkovStateModel())  # NOTE: like pipeline1, this needs a clustering step before the MSM to fit
])

In [17]:
traj = dataset.trajectories
fit_and_plot(pipeline2, traj)


Eigenvalue sum 1.95731566278

In [67]:
tica = tICA(n_components=780)

In [68]:
feat = AtomPairsFeaturizer(heavy_pairs)
featurized = feat.fit_transform(traj)

In [69]:
len(featurized)


Out[69]:
10

In [70]:
featurized[0].shape


Out[70]:
(9979, 780)
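
The 780 features match the number of heavy-atom pairs: Met-enkephalin has 40 heavy atoms, and 40 * 39 / 2 = 780, so n_components=780 keeps every tICA component. A quick check (sketch):

# Sketch: 780 pairwise distances implies 40 heavy atoms (40 * 39 / 2 == 780)
print(len(heavy_atoms), len(heavy_pairs))  # expect 40, 780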

In [71]:
Y = tica.fit_transform(featurized)

In [72]:
len(Y)


Out[72]:
10

In [73]:
pl.plot(tica.eigenvalues_)


Out[73]:
[<matplotlib.lines.Line2D at 0x1166ef950>]

In [74]:
tica_transform = tica.transform(featurized)

In [75]:
X_ = np.vstack(tica_transform)
X_.shape


Out[75]:
(99916, 780)

In [76]:
from sklearn.decomposition import PCA

In [77]:
pca = PCA()

In [78]:
pca.fit(X_)


Out[78]:
PCA(copy=True, n_components=None, whiten=False)

In [79]:
pl.plot(pca.explained_variance_ratio_)


Out[79]:
[<matplotlib.lines.Line2D at 0x1177dbd50>]
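
Beyond eyeballing the scree plot, the cumulative explained variance gives a concrete dimensionality estimate. A minimal sketch using the fitted PCA object (the 0.95 threshold is an arbitrary choice):

# Sketch: how many principal components capture 95% of the variance in the tICA-projected data
cum_var = np.cumsum(pca.explained_variance_ratio_)
print(np.searchsorted(cum_var, 0.95) + 1)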

In [81]:
X_pca = pca.transform(X_)
pl.hist2d(X_pca[:, 0], X_pca[:, 1], bins=50);



In [83]:
pl.hist2d(tica.transform([X_])[:,0],tica.transform([X_])[:,1],bins=50);


---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-83-4f1584de8615> in <module>()
----> 1 pl.hist2d(tica.transform([X_])[:,0],tica.transform([X_])[:,1],bins=50);

TypeError: list indices must be integers, not tuple
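
tICA.transform returns a list of arrays (one per input sequence), so the result has to be indexed or stacked before slicing. Moreover, X_ already holds the stacked tICA projection from In [75], so the intended histogram can be drawn from it directly (sketch):

# Sketch: X_ is already the stacked tICA output, so plot it directly
pl.hist2d(X_[:, 0], X_[:, 1], bins=50);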

In [30]:
import pylab as pl
pl.plot(tica.timescales_)


Out[30]:
[<matplotlib.lines.Line2D at 0x117174690>]
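
The timescales_ values are in units of frames (assuming the default lag_time of 1 frame); since frames were saved every 5 ps (see the dataset description), multiplying by 0.005 converts them to nanoseconds. A sketch under that assumption:

# Sketch: convert implied tICA timescales from frames to ns (5 ps per saved frame)
pl.plot(tica.timescales_ * 0.005)
pl.ylabel('timescale (ns)')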

In [28]:
len(Y)


Out[28]:
10

In [22]:
Y[0].shape


Out[22]:
(9979, 100)

In [23]:
Y_stack = np.vstack(Y)

In [24]:
Y_im_t = im.fit_transform(Y_stack.T)
plt.hist2d(Y_im_t[:,0],Y_im_t[:,1],bins=50);
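
Note that fit_transform is applied to Y_stack.T, so Isomap here embeds the tICA components (the columns) rather than the conformations. To embed conformations, which is presumably the comparison intended with Das et al., the frames would need to be subsampled first since Isomap scales poorly with the ~100k-frame dataset. A sketch under that assumption (the stride is arbitrary):

# Sketch: embed (subsampled) frames rather than features
stride = 100
Y_sub = Y_stack[::stride]
frame_embedding = manifold.Isomap(n_neighbors=5, n_components=2).fit_transform(Y_sub)
plt.hist2d(frame_embedding[:, 0], frame_embedding[:, 1], bins=50);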



In [60]:
plt.scatter(Y_im_t[:,0],Y_im_t[:,1]);



In [33]:
featurized_stack = np.vstack(featurized)

In [38]:
Y_im = im.fit_transform(featurized_stack.T)

In [39]:
Y_im.shape


Out[39]:
(780, 2)

In [41]:
plt.hist2d(Y_im[:,0],Y_im[:,1],bins=50);



In [ ]: